library(rafalib)
library(edgeR)
library(rtracklayer)
library(ggplot2)
library(cowplot)
library(reshape2)
library(ggdendro)
library(ggbeeswarm)
library(grid)
library(gridExtra)
library(ggpubr)
library(pheatmap)
library(stringr)
library(ggbeeswarm)
The data used in this analysis were generated in a study of LXR activation in AOM/DSS, a mouse model of colorectal cancer. Colon samples were collected at four time points: Prior to AOM (day 0), and ~7 days after each of three cycles of DSS (days 22, 43 and 70). LXR agonist GW3965 was given from start of protocol. To achieve a “day 0” diet control, non-AOM/DSS samples were collected after 22 or 43 days of standard or GW3965 diet. As such, “day 0” and “day 22/43 Control” are used interchangeably.
Samples were used for bulk RNASeq, and raw sequences were quantified and annotated (with ENSEMBL ID) using Kallisto. Counts and TPMs from Kallisto-generated .tsv-files are combined into one matrix each and annotated with gene symbols. Count, TPM and metadata matrices are ordered to match sample order across matrices.
if(!all(file.exists("analysis/gene_tpm.tsv","analysis/gene_counts.tsv"))){
if(!file.exists("raw_data/metadata.csv")){
# Download metadata file if it cannot be found
}
filenames <- list.files("raw_data","R.*\\.tsv")
if(length(filenames)==0){
# Download kallisto count files if they cannot be found
filenames <- list.files("raw_data","R.*\\.tsv")
}
if(!all(file.exists("analysis/raw_counts.tsv","analysis/raw_tpm.tsv"))){
# Compile count and TPM tables from kallisto counts
raw_counts <- sapply(filenames,function(x){
read.delim(paste0("raw_data/",x))[,"est_counts"] })
colnames(raw_counts) <- sub(".tsv","",colnames(raw_counts))
rownames(raw_counts) <- read.delim(paste0("raw_data/",filenames[1]))[,"target_id"]
raw_tpm <- sapply(filenames,function(x){
read.delim(paste0("raw_data/",x))[,"tpm"] })
colnames(raw_tpm) <- sub(".tsv","",colnames(raw_tpm))
rownames(raw_tpm) <- read.delim(paste0("raw_data/",filenames[1]))[,"target_id"]
# Load metadata and match to counts data
metadata <- read.csv2("raw_data/metadata.csv", row.names = 1 )
common <- rownames(metadata)[ rownames(metadata) %in% colnames(raw_counts)]
metadata <- metadata[common, ]
raw_counts <- round(raw_counts[,common],0)
# raw_counts <- raw_counts[,common]
raw_tpm <- raw_tpm[,common]
# Save counts and TPM tables and metadata
write.table(raw_counts,paste0("analysis/raw_counts.tsv"))
write.table(raw_tpm,paste0("analysis/raw_tpm.tsv"))
write.csv2(metadata, "raw_data/metadata.csv")
}else{
raw_counts = read.table("analysis/raw_counts.tsv", row.names = 1)
raw_tpm = read.table("analysis/raw_tpm.tsv", row.names = 1)
metadata = read.csv2("raw_data/metadata.csv", row.names = 1)
}
# Translate to gene names
# Next, we can summarize ensembl transcript IDs to gene symbols for easier data interpretation. Here, we will download the GTF file from ensemble used for the annotation in the ammping step from Kallisto and annotate accordingly.
if(!file.exists( paste0("raw_data/Mus_musculus.GRCm38.101.gtf") )){
setwd("raw_data")
system("wget ftp://ftp.ensembl.org/pub/release-101/gtf/mus_musculus/Mus_musculus.GRCm38.101.gtf.gz")
system("gunzip Mus_musculus.GRCm38.101.gtf.gz")
setwd("../")
}
gtf <- import.gff(paste0("raw_data/Mus_musculus.GRCm38.101.gtf"))
gtf <- gtf[ ! is.na(gtf$transcript_id) ,]
#gtf <- gtf[ gtf$transcript_source == "ensembl_havana" ,]
#gtf <- gtf[ gtf$transcript_biotype == "protein_coding" ,]
annot <- gtf$gene_name [ match( sub("[.].*","",rownames(raw_counts)),gtf$transcript_id ) ]
annot[is.na(annot)] <- "NA"
counts <- rowsum(raw_counts , group = annot)
counts <- counts[rownames(counts) != "NA",]
tpm <- rowsum(raw_tpm , group = annot)
tpm <- tpm[rownames(tpm) != "NA",]
write.table(counts,"analysis/counts.tsv")
write.table(tpm,"analysis/tpm.tsv")
# Filter counts and TPM tables
sel <- rowSums( counts > 10 ) >= 3
filt_counts <- counts[sel,]
filt_tpm <- tpm[sel,]
# Save filtered tables
write.table(filt_counts,"analysis/gene_counts.tsv")
write.table(filt_tpm,"analysis/gene_tpm.tsv")
}else{
filt_counts = read.table("analysis/gene_counts.tsv")
filt_tpm = read.table("analysis/gene_tpm.tsv")
metadata <- read.csv2("raw_data/metadata.csv", row.names = 1 )
filt_counts = filt_counts[,rownames(metadata)]
counts = read.table("analysis/counts.tsv")
tpm = read.table("analysis/tpm.tsv")
}
oldcounts = read.table("/Users/jenfra/LocalWork/repos/e_villablanca_2005/analysis_bulk/counts.tsv")
oldcounts = oldcounts[,rownames(metadata)]
oldfilt = read.table("/Users/jenfra/LocalWork/repos/e_villablanca_2005/analysis_bulk/filt_counts.tsv")
oldfilt = oldfilt[,rownames(metadata)]
Homogeneity in sample quality is assessed through number of counts, number of features and % ribosomal RNA. Integrity values (RIN) provided by Novogene have also been included. The comparisons indicate some degradation of certain samples (lower RIN and higher % ribosome RNA), mostly AOM-DSS samples at day 43. This should be considered for further analyses that include day 43.
metadata$nFeatures <- colSums(filt_counts>0)
metadata$nCounts <- colSums(filt_counts)
is_mito <- grepl("mt-",rownames(filt_counts))
metadata$perc_mito <- colSums(filt_counts[is_mito,]) / colSums(filt_counts) * 100
is_ribo <- grepl("Rp[ls]",rownames(filt_counts))
metadata$perc_ribo <- colSums(filt_counts[is_ribo,]) / colSums(filt_counts) * 100
write.table(metadata,"analysis/metadata_QC.tsv")
metadata$diet <- factor(metadata$diet, levels = c("STD","GW3965"))
metadataqc = metadata
x <- c("nFeatures","nCounts","perc_ribo","perc_mito","RIN_values")
metadataqc$Group = paste(gsub("YES","AOM-DSS",
gsub("NO","Ctrl", as.character(metadataqc$AOM_DSS))),
metadataqc$day, metadataqc$diet)
metadataqc$Group = factor(metadataqc$Group, levels = paste(rep(c("Ctrl","AOM-DSS"),c(6,6)), rep(rep(c("22","43","70"),c(2,2,2)),2),rep(c("STD","GW3965"),6)))
glist = list()
for(i in x){
for(g in c("Group", "cage")){
imean = aggregate(metadataqc[,i],by=list(metadataqc[,g]),mean)
imd = metadataqc
imd = imd[order(imd[,g]),]
imd$sample_ID = factor(imd$sample_ID, levels = imd$sample_ID)
imd$mean = imean$x[match(imd[,g], imean$Group.1)]
imeanx = aggregate(as.numeric(imd$sample_ID), by = list(imd[,g]), mean)
glist[[g]][[i]] = ggplot(imd, aes_string(y = i, x = "as.numeric(sample_ID)")) +
geom_bar(aes_string(fill = g), stat = "identity", position = "dodge", show.legend = FALSE, alpha = 0.8) +
theme(panel.background = element_blank(),
axis.text.x = element_text(angle = 30, hjust = 1, size = 10),
axis.title.x = element_blank()) +
geom_line(aes_string(y = "mean", group = g)) +
scale_x_continuous(breaks = imeanx[,2], labels = imeanx[,1])
}
}
plot_grid(textGrob("Model, day and diet", hjust = 0.5, gp=gpar(fontsize=16)),
plot_grid(plotlist = glist[["Group"]], ncol = 1, align = "v"),ncol = 1, rel_heights = c(0.05,1))
plot_grid(textGrob("Cage", hjust = 0.5, gp=gpar(fontsize=16)),
plot_grid(plotlist = glist[["cage"]], ncol = 1, align = "v"),ncol = 1, rel_heights = c(0.05,1))
In order to get an overall view of the variation between samples, the 500 most variable genes are selected. The graph below shows the mean expression and standard deviation, with each dot representing one gene and the selected genes marked in red.
log_tpm <- log2(filt_tpm+1)
log_sd <- apply(log_tpm,1,sd)
log_means <- apply(log_tpm,1,mean)
HVGs <- names(sort(log_sd,T))[1:500] # highly variable genes
plot(log_means,log_sd,
col=ifelse(names(log_sd) %in% HVGs,"red","black" ),
cex=.5,pch=16)
lines(smooth.spline(log_means,log_sd,spar = 1,nknots = 50),col="blue",lwd=3)
Using the log-transformed expression of the 500 highly variable genes, a PCA is performed to visualize the main variation between samples. Different aspects of the metadata are overlayed on the visualization in order to identify contributing factors. In general, samples can be grouped by non-AOM-DSS and AOM-DSS, and between days in the case of AOM-DSS. Differences between diets are not visible in the first principal components. Differences in measures of integrity are not reflected on the first two principal components.
PC <- prcomp( t(log_tpm[HVGs,]) ,center = T, scale. = T )
PCvar <- PC$sdev^2 / sum(PC$sdev^2) * 100
names(PCvar) <- 1:length(PCvar)
elbow = ggplot(data.frame(variance = PCvar, PC = as.numeric(names(PCvar))), aes(x = PC, y = variance)) +
geom_point() + geom_line() +
theme(panel.background = element_blank()) +
labs(y = "% Variance explained")
pcdata <- cbind(PC$x, metadata)
g <- ggplot(pcdata, aes(x = PC1,y = PC2, color = factor(day), shape = AOM_DSS)) +
geom_point(size = 4) + theme_classic() +
ggrepel::geom_text_repel(aes(label = diet))
plot_grid(elbow,g, ncol = 2, rel_widths = c(0.5,1))
ggplot(pcdata, aes(x = PC1,y = PC3, color = factor(day), shape = AOM_DSS)) +
geom_point(size = 4) + theme_classic() +
ggrepel::geom_text_repel(aes(label = diet))
g1<- ggplot(pcdata, aes(x = PC1,y = PC2)) + geom_point(aes(size = RIN_values)) +
theme_classic() + theme(legend.position = "bottom")
g2 <- ggplot(pcdata, aes(x = PC1,y = PC2)) + geom_point(aes(size = perc_ribo)) +
theme_classic() + theme(legend.position = "bottom")
plot_grid(g1,g2, ncol = 2)
Using the log-transformed expression of the 500 highly variable genes, hierarchical clustering is performed to identify groups of similar samples.
Samples cluster imperfectly based on AOM-DSS and day when examining all samples.
# Scale each measurement (independently) to have a mean of 0 and variance of 1
hvgdata.scaled <- t(log_tpm[HVGs,])
hvgdata.scaled <- as.data.frame(scale(hvgdata.scaled))
# Run clustering
hvgdata.matrix <- as.matrix(hvgdata.scaled)
hvgdata.dendro <- as.dendrogram(hclust(d = dist(x = hvgdata.matrix), method = "ward.D2"))
gene.dendro <- as.dendrogram(hclust(d = dist(x = t(hvgdata.matrix)), method = "ward.D2"))
p1_dendro = dendro_data(hvgdata.dendro)
# Create dendrogram plot
dendro.plot <- ggdendrogram(data = hvgdata.dendro, rotate = FALSE, theme_dendro = FALSE) +
theme(axis.text.y = element_blank(), axis.text.x = element_text(size = 12)) +
coord_cartesian(xlim = c(0.5, nrow(hvgdata.scaled) + 0.5), expand = FALSE) +
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
gene.dendro.plot <- ggdendrogram(data = gene.dendro, rotate = TRUE) +
theme(axis.text.y = element_blank())
# Heatmap
hvgdata.withnames <- as.data.frame(hvgdata.scaled)
hvgdata.withnames$names <- rownames(hvgdata.scaled)
# Data wrangling
hvgdata.long <- as.data.frame(melt(hvgdata.withnames, id = c("names")))
# Extract the order of the tips in the dendrogram
hvgdata.order <- order.dendrogram(hvgdata.dendro)
gene.order <- order.dendrogram(gene.dendro)
# Order the levels according to their position in the cluster
hvgdata.long$names <- factor(x = hvgdata.long$names,
levels = rownames(hvgdata.scaled)[hvgdata.order],
ordered = TRUE)
hvgdata.long$variable <- factor(x = hvgdata.long$variable,
levels = colnames(hvgdata.scaled)[gene.order],
ordered = TRUE)
hvgdata.metadata <- metadata
hvgdata.metadata$AOM_DSS <- as.character(hvgdata.metadata$AOM_DSS)
hvgdata.metadata$AOM_DSS[hvgdata.metadata$AOM_DSS == "YES"] <- "AOMDSS"
hvgdata.metadata$AOM_DSS[hvgdata.metadata$AOM_DSS == "NO"] <- "Ctrl"
hvgdata.metadata$names <- rownames(hvgdata.metadata)
metadata.long <- as.data.frame(melt(hvgdata.metadata, id = c("names")))
metadata.long$names <- factor(x = metadata.long$names,
levels = rownames(hvgdata.scaled)[hvgdata.order],
ordered = TRUE)
metadata.long$value <- factor(x = metadata.long$value,
levels = c("22","43","70","AOMDSS","Ctrl","STD","GW3965",unique(metadata.long$variable[!(metadata.long$variable %in% c("22","43","70","AOMDSS","Ctrl","STD","GW3965"))])),
ordered = TRUE)
# Create heatmap plot
heatmap.plot <- ggplot(data = hvgdata.long, aes(x = names, y = variable)) +
geom_tile(aes(fill = value)) +
scale_fill_gradient2(low = "#000088",high = "#880000", mid = "white") +
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
legend.position = "right",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank()) + labs(fill = "Expression")
# Create metadata heatmap plot
metadata.heatmap.plot <- ggplot(data = metadata.long[metadata.long$variable %in% c("diet","AOM_DSS","day"),], aes(x = names, y = variable)) +
geom_tile(aes(fill = value)) +
scale_fill_ordinal() +
theme(axis.text.x = element_blank(),
axis.text.y = element_text(size = 12),
axis.title = element_blank(),
axis.ticks = element_blank(),
legend.position = "right",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank()) + labs(fill = "Group")
plot_grid(dendro.plot, metadata.heatmap.plot, heatmap.plot, ncol = 1, align = "v", axis = "lr", rel_heights = c(0.3,0.1,0.6))
Samples cluster imperfectly based on day when examining AOM-DSS samples. The only group in which STD and GW cluster separately based on these genes is among the day 22 AOM-DSS samples.
# Scale each measurement (independently) to have a mean of 0 and variance of 1
hvgdata.scaled <- t(log_tpm[HVGs,metadata$AOM_DSS %in% "YES"])
hvgdata.scaled <- as.data.frame(scale(hvgdata.scaled))
# Run clustering
hvgdata.matrix <- as.matrix(hvgdata.scaled)
hvgdata.dendro <- as.dendrogram(hclust(d = dist(x = hvgdata.matrix), method = "ward.D2"))
gene.dendro <- as.dendrogram(hclust(d = dist(x = t(hvgdata.matrix)), method = "ward.D2"))
p1_dendro = dendro_data(hvgdata.dendro)
# Create dendrogram plot
dendro.plot <- ggdendrogram(data = hvgdata.dendro, rotate = FALSE, theme_dendro = FALSE) +
theme(axis.text.y = element_blank(), axis.text.x = element_text(size = 12)) +
coord_cartesian(xlim = c(0.5, nrow(hvgdata.scaled) + 0.5), expand = FALSE) +
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
gene.dendro.plot <- ggdendrogram(data = gene.dendro, rotate = TRUE) +
theme(axis.text.y = element_blank())
# Heatmap
hvgdata.withnames <- as.data.frame(hvgdata.scaled)
hvgdata.withnames$names <- rownames(hvgdata.scaled)
# Data wrangling
hvgdata.long <- as.data.frame(melt(hvgdata.withnames, id = c("names")))
# Extract the order of the tips in the dendrogram
hvgdata.order <- order.dendrogram(hvgdata.dendro)
gene.order <- order.dendrogram(gene.dendro)
# Order the levels according to their position in the cluster
hvgdata.long$names <- factor(x = hvgdata.long$names,
levels = rownames(hvgdata.scaled)[hvgdata.order],
ordered = TRUE)
hvgdata.long$variable <- factor(x = hvgdata.long$variable,
levels = colnames(hvgdata.scaled)[gene.order],
ordered = TRUE)
hvgdata.metadata <- metadata[metadata$AOM_DSS %in% "YES",]
hvgdata.metadata$AOM_DSS <- as.character(hvgdata.metadata$AOM_DSS)
hvgdata.metadata$AOM_DSS[hvgdata.metadata$AOM_DSS == "YES"] <- "AOMDSS"
hvgdata.metadata$AOM_DSS[hvgdata.metadata$AOM_DSS == "NO"] <- "Ctrl"
hvgdata.metadata$names <- rownames(hvgdata.metadata)
metadata.long <- as.data.frame(melt(hvgdata.metadata, id = c("names")))
metadata.long$names <- factor(x = metadata.long$names,
levels = rownames(hvgdata.scaled)[hvgdata.order],
ordered = TRUE)
metadata.long$value <- factor(x = metadata.long$value,
levels = c("22","43","70","AOMDSS","Ctrl","STD","GW3965",unique(metadata.long$variable[!(metadata.long$variable %in% c("d22","d43","d70","AOMDSS","Ctrl","STD","GW3965"))])),
ordered = TRUE)
# Create heatmap plot
heatmap.plot <- ggplot(data = hvgdata.long, aes(x = names, y = variable)) +
geom_tile(aes(fill = value)) +
scale_fill_gradient2(low = "#000088",high = "#880000", mid = "white") +
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
legend.position = "right",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank()) + labs(fill = "Expression")
# Create metadata heatmap plot
metadata.heatmap.plot <- ggplot(data = metadata.long[metadata.long$variable %in% c("diet","day"),], aes(x = names, y = variable)) +
geom_tile(aes(fill = value)) +
scale_fill_ordinal() +
theme(axis.text.x = element_blank(),
axis.text.y = element_text(size = 12),
axis.title = element_blank(),
axis.ticks = element_blank(),
legend.position = "right",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank()) + labs(fill = "Group")
plot_grid(dendro.plot, metadata.heatmap.plot, heatmap.plot, ncol = 1, align = "v", axis = "lr", rel_heights = c(0.3,0.1,0.6))
Fold-change between diet groups is calculated by fitting the data to a linear model y ~ (day & diet) in edgeR. By examining the difference between the two diet groups at day 22, we retrieve a ranked list genes based on fold-change, which is used as input in the desktop GSEA tool.
# Defining groups
groupstring <- paste(metadata$AOM_DSS, metadata$day, metadata$diet)
groupstringlevels <- sort(unique(groupstring))
group <- factor(groupstring,
levels=groupstringlevels[c(grep("STD",groupstringlevels),
grep("GW3965",groupstringlevels))])
# y_old = DGEList(counts=oldcounts,group=group)
# keep_old <- filterByExpr(y_old)
# y_old <- y_old[keep_old,,keep.lib.sizes=FALSE]
# y_old <- calcNormFactors(y_old)
#
# design <- model.matrix(~group)
#
# y_old <- estimateDisp(y_old,design)
#
y <- DGEList(counts=counts,group=group)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
design <- model.matrix(~group)
y <- estimateDisp(y,design)
# Fitting the data to the model
fit <- glmQLFit(y,design)
# fit_old <- glmQLFit(y_old,design)
# Generating all desired comparisons
compvars <- colnames(fit)
qlf_diet_day22 <- glmQLFTest(fit,contrast = ((compvars ==
paste0("group",
"YES 22 GW3965"))*1) +
((compvars==paste0("group", "YES 22 STD"))*-1))
# qlf_diet_day22_old <- glmQLFTest(fit_old,contrast = ((compvars ==
# paste0("group",
# "YES 22 GW3965"))*1) +
# ((compvars==paste0("group", "YES 22 STD"))*-1))
write.table(file = "analysis/day22_GWvsDMSO_rankedlist.RNK", qlf_diet_day22$table[order(qlf_diet_day22$table$logFC),"logFC",drop=FALSE],col.names = FALSE, quote = FALSE, sep = "\t")
loadGeneSets <- function(wd = getwd()){
if(file.exists(paste0(wd,"/tools/mouseKEGG.RData"))){
load(paste0(wd,"/tools/mouseKEGG.RData"),envir = .GlobalEnv)
}else{
mmupaths <- keggList("pathway",organism = "mmu")
mmupathgenes <- list()
mmupathnames <- list()
for(i in 1:length(mmupaths)){
mmupathgenes[[i]] <- keggGet(names(mmupaths)[i])[[1]]$GENE
mmupathnames[[i]] <- keggGet(names(mmupaths)[i])[[1]]$NAME
}
names(mmupathgenes) <- names(mmupaths)
names(mmupathnames) <- names(mmupaths)
mmupathgenesclean <- list()
for(i in 1:length(mmupathgenes)){
if(!is.null(mmupathgenes[[i]])){
mmupathgenesclean[[i]] <- mmupathgenes[[i]][seq(2,length(mmupathgenes[[i]]),2)]
mmupathgenesclean[[i]] <- gsub(";.*", "", mmupathgenesclean[[i]])
}
}
names(mmupathgenesclean) <- names(mmupathgenes)
kegg_pathways <- mmupathgenesclean
names(kegg_pathways) <- mmupathnames
save(kegg_pathways, file = paste0(wd,"/tools/mouseKEGG.RData"))
}
if(file.exists(paste0(wd,"/tools/mouseGOBP.RData"))){
load(paste0(wd,"/tools/mouseGOBP.RData"),envir = .GlobalEnv)
}else{
if(!require(KEGGREST)){
BiocManager::install("clusterProfiler")
library(clusterProfiler)
}
if(!require(org.Mm.eg.db)){
BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
}
allgenes <- rownames(read.table("analysis_bulk/counts.tsv"))
gobp_pathways_entrez <- list()
goterms <- AnnotationDbi::Ontology(GO.db::GOTERM)
goterms <- goterms[goterms == "BP"]
getGOLevel(names(goterms))
go2genes <- suppressMessages(AnnotationDbi::mapIds(org.Mm.eg.db, keys=names(goterms), column="SYMBOL",
keytype="GOALL", multiVals='list'))
gobp_pathways_goID_all <- go2genes
gobp_pathways_all <- gobp_pathways_goID_all
names(gobp_pathways_all) <- Term(GO.db::GOTERM)[match(names(go2genes), keys(GO.db::GOTERM))]
presentgenes <- sapply(gobp_pathways, function(x){sum(!is.na(match(allgenes, x)))})
gobp_pathways_goID <- gobp_pathways_goID_all[presentgenes>3 & presentgenes < 3000]
gobp_pathways <- gobp_pathways_all[presentgenes>3 & presentgenes<3000]
save(gobp_pathways, file = paste0(wd,"/tools/mouseGOBP.RData"))
save(gobp_pathways_goID, file = paste0(wd,"/tools/mouseGOBP_GOID.RData"))
save(gobp_pathways_all, file = paste0(wd,"/tools/mouseGOBP_all.RData"))
save(gobp_pathways_goID_all, file = paste0(wd,"/tools/mouseGOBP_GOID_all.RData"))
}
pathways_list <- list(kegg_pathways, gobp_pathways)
names(pathways_list) <- c("KEGG","BP")
assign("pathways_list",pathways_list, envir = .GlobalEnv)
}
loadGeneSets()
keggnames <- gsub(" - Mus musculus \\(mouse\\)","",names(pathways_list[["KEGG"]]))
tableGraph <- function(sigRes){
sigRes <- sigRes[order(sigRes$NES, decreasing = FALSE),]
sigRes <- sigRes[abs(sigRes$ES)>0.5,]
sigRes$showpathway <- gsub("\\.\\.\\.MUS.MUSCULUS\\.\\.MOUSE\\.","",sigRes$NAME)
sigRes$showpathway <- translator[match(sigRes$showpathway,translator[,1]),2]
sigRes$showpathway <- factor(sigRes$showpathway,levels = sigRes$showpathway)
g1 <- ggplot(sigRes, aes(y = showpathway, x = ES, fill = ES)) +
geom_bar(aes(x = NES), stat = "identity", fill = "#BBBBBB") +
geom_bar(stat = "identity") +
theme_classic() +
scale_fill_gradient2(low = "#4b4bdb", high = "#db4b4b") +
theme(panel.grid.major.y = element_line(color = "grey", size = 0.1),
axis.line.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_line(color = "grey", size = 0.1),
axis.text = element_text(size = 8),
axis.title.x = element_text(size = 8),
legend.position = "none"
) +
labs(x = "ES / NES")
g2 <- ggplot(sigRes, aes(y = showpathway, x = FDR.q.val)) + geom_point(size = 0.75) +
theme_classic() +
scale_x_continuous(breaks = c(0,0.05), limits = c(0,0.05)) + #expand = expansion(mult = 1.05)) +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(size = 8),
axis.title.x = element_text(size = 8),
panel.grid.major.y = element_line(color = "grey", size = 0.1)) +
labs(x = "q-value")
return(list(g1,g2))
}
getFiles <- function(dir){
tsvfiles <- list.files(dir, pattern = "tsv$")
gseafiles <- tsvfiles[grep("gsea_report", tsvfiles)]
upfile <- gseafiles[grep("na_pos", gseafiles)]
downfile <- gseafiles[grep("na_neg", gseafiles)]
uptable <- read.table(paste0(dir, "/", upfile), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
downtable <- read.table(paste0(dir, "/", downfile), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
return(list(up = uptable, down = downtable))
}
log_tpm <- log2(tpm+1)
day22logFC <- getFiles("analysis/gsea_results")
allPaths <- c(day22logFC[["up"]]$NAME,day22logFC[["down"]]$NAME)
allPathnames <-gsub("\\.\\.\\.MUS.MUSCULUS\\.\\.MOUSE\\.","",allPaths)
translator <- cbind(allPathnames,keggnames[match(gsub("\\.","-",tolower(allPathnames)), gsub("\\/","-",gsub(",","-",gsub("\\(","-",gsub("\\)","-",gsub(" ","-",tolower(keggnames)))))))])
FDRlimit = 0.05
ESlimit = 0
returnSig <- function(gseaTable){
return(gseaTable$NAME[gseaTable$FDR.q.val<FDRlimit & abs(gseaTable$ES)>ESlimit])
}
day22up <- returnSig(day22logFC$up)
day22down <- returnSig(day22logFC$down)
plot_grid(tableGraph(rbind(day22logFC$up[match(day22up, day22logFC$up$NAME),1:11],
day22logFC$down[match(day22down, day22logFC$down$NAME),1:11]))[[1]],
ncol = 1, align = "v")
PPARday22 <- read.table("analysis/gsea_results/PPAR.SIGNALING.PATHWAY...MUS.MUSCULUS..MOUSE..tsv",
sep="\t", stringsAsFactors = FALSE, header = TRUE)
samples = which(metadata$AOM_DSS=="YES" & metadata$day=="22")
genedf <- log_tpm[PPARday22$SYMBOL[PPARday22$CORE.ENRICHMENT=="Yes"],
samples]
logFCs <- qlf_diet_day22$table[order(qlf_diet_day22$table$logFC),"logFC",drop=FALSE][PPARday22$SYMBOL[PPARday22$CORE.ENRICHMENT=="Yes"],]
names(logFCs) <- PPARday22$SYMBOL[PPARday22$CORE.ENRICHMENT=="Yes"]
genedf <- genedf[!is.na(match(rownames(genedf),names(logFCs))),]
fcorder <- order(match(rownames(genedf),names(sort(logFCs, decreasing = TRUE))))
if(sum(logFCs>0)==0){fcorder <- rev(fcorder)}
genedf <- genedf[fcorder,]
hmdf <- t(apply(genedf[!is.na(genedf[,1]),],1,scale))
hmdf <- hmdf[!is.nan(hmdf[,1]),]
colnames(hmdf) <- colnames(genedf)
bluewhitered <- colorRampPalette(c("#4b4bdb","white","#db4b4b"))(1000)
colorscale <- bluewhitered[max(c(round((min(hmdf)/2+1)*500),0)):
min(c(round((max(hmdf)/2+1)*500),1000))]
maxFC = 5
logFCs[logFCs>maxFC] <- maxFC
logFCs[logFCs<(-maxFC)] <- -maxFC
logFCAnnotColors <- colorRampPalette(c("white","white","#2f007e"))(301)[(round(min(logFCs[rownames(hmdf)], na.rm = TRUE),1)*(150/maxFC)+151):
(round(max(logFCs[rownames(hmdf)], na.rm = TRUE),1)*(150/maxFC)+151)]
phm <- pheatmap(hmdf, clustering_method = "ward.D2", cluster_rows = FALSE,
cluster_cols = TRUE,
annotation_colors = list(diet = c(GW3965 = "#bdc9e2",STD = "#dfdfde"),
logFC = logFCAnnotColors),
annotation_col = (metadata[match(colnames(genedf),
rownames(metadata)),
"diet", drop = FALSE]),
annotation_row = data.frame(logFC = round(logFCs, 1), row.names = names(logFCs)),
col = colorscale, show_colnames = FALSE, # border_color = "#CCCCCC",
#cex = ifelse(nrow(hmdf)>60,0.75,1),
main = "PPAR signaling pathway, day 22")
DE genes are defined using
y ~ day * diet + % ribosomal RNA
### Functions for evaluation and visualization of modules
keggbargraphs <- function(thisclustering, universe, pathways = kegg_pathways, nPaths = 15){
nclusters = max(thisclustering)
clustercolors = gg_color_hue(nclusters)
glist = list()
for(clusteri in 1:max(thisclustering)){
genes_cluster = names(thisclustering)[thisclustering == clusteri]
kegg_cluster = NULL
while(is.null(kegg_cluster)){
kegg_cluster <- ownORA(genes_cluster, universe = universe, pathways = pathways)
}
if(nrow(kegg_cluster)>0){
kegg_cluster <- kegg_cluster[order(kegg_cluster$Adjusted.P.value),]
#kegg_cluster[1:3,]
kegg_cluster <- kegg_cluster[ as.numeric(sub("[/].*","",kegg_cluster$Overlap)) >= 4 , ]
kegg_cluster <- kegg_cluster[ 1:nPaths , ]
kegg_cluster$shortGenes <- lapply(kegg_cluster$Genes,
function(x){paste(strsplit(x,";")[[1]][1:8][
!is.na(strsplit(x,";")[[1]][1:8])],
collapse = ", ")})
notDone = TRUE
len = 28
while(length(unique(kegg_cluster$shortTerm[!is.na(kegg_cluster$shortTerm)]))!=
length(kegg_cluster$shortTerm[!is.na(kegg_cluster$shortTerm)]) | notDone){
kegg_cluster$shortTerm[!is.na(kegg_cluster$Term)] <-
sapply(as.character(kegg_cluster$Term)[!is.na(kegg_cluster$Term)],
function(x){
if(nchar(x)>(len+2)){
x <- paste0(substr(gsub("positive","pos",
gsub("negative","neg",
gsub("regulation","reg",
x))),1,len),"...")};
return(x)})
len = len+1
notDone = FALSE
}
if(!is.null(kegg_cluster$shortTerm)){
kegg_cluster$shortTerm <- factor(kegg_cluster$shortTerm,
levels = unique(rev(kegg_cluster$shortTerm)))
}else{
kegg_cluster$shortTerm <- NA
}
{
b <- ggplot(kegg_cluster[1:sum(!is.na(kegg_cluster$Term)),],
aes(x = -log10(Adjusted.P.value),
y = shortTerm)) +
geom_bar(stat ="identity", fill = clustercolors[clusteri]) +
theme_classic() +
theme(axis.line = element_blank(), axis.title.y = element_blank()) +
labs(title = paste0("Module ",clusteri," (",length(genes_cluster)," genes)")) +
geom_vline(xintercept = -log10(0.05), linetype = "dashed", color = "#BBBBBB") +
geom_text(x = 0,
aes(label = paste0(" ",Overlap,": ",shortGenes)),
hjust = 0, size = 3) +
xlim(c(0, max(5,max(-log10(kegg_cluster[1:nPaths,"Adjusted.P.value"]), na.rm = TRUE))))
}
}
glist <- c(glist,list(b))
}
return(glist)
}
arrangePlots <- function(gl, nr=2, nc=3, nplots = nr*nc, ...){
for(i in seq(1,length(gl), nplots)){
grid.arrange(grobs = gl[i:min(c((i+nplots-1), length(gl)))], ncol = nc,
nrow = nr, ...)
}
}
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
ownORA <- function(genelist, pathways, universe){
### Adapted from enricher_internal from DOSE
intersects <- list()
for(i in 1:length(pathways)){
intersects[[i]] <- list(intersect(pathways[[i]], genelist),intersect(pathways[[i]], universe))
}
names(intersects) <- names(pathways)
args.df <- data.frame(numWdrawn = sapply(intersects,function(x){length(x[[1]])})-1, ## White balls drawn
numW = sapply(intersects,function(x){length(x[[2]])}), ## White balls
numB = length(unique(universe))-sapply(intersects,function(x){length(x[[2]])}), ## Black balls
numDrawn = length(genelist)) ## balls drawn
rownames(args.df) <- names(pathways)
args.df <- args.df[args.df[,2]>3,]
pvalues <- apply(args.df, 1, function(n)
phyper(n[1], n[2], n[3], n[4], lower.tail=FALSE)
)
terms = rownames(args.df)
overlaps = paste(args.df[,1]+1,args.df[,2],sep="/")
#Term Overlap P.value Adjusted.P.value Genes
adjpvalues = p.adjust(pvalues, method = "BH")
genes = lapply(intersects[rownames(args.df)],function(x){paste(x[[1]], collapse=";")})
res <- data.frame(Term = terms,
Overlap = overlaps,
P.value = pvalues,
Adjusted.P.value = adjpvalues,
Genes = unlist(genes),
row.names = NULL,
stringsAsFactors = FALSE)
return(res)
}
toplist <- topTags(daydiet_diet_lrt, n = nrow(daydiet_diet_lrt))
allgenes <- rownames(toplist$table)[toplist$table$FDR<0.05 ]
#allgenes <- rownames(toplist$table)[toplist$table$FDR<0.05 & minfdrdiet[rownames(toplist$table)]<0.2]
logFCs = allLogFCswide[allgenes,order(colnames(allLogFCswide))]
load("/Users/jenfra/LocalWork/repos/e_villablanca_2005/analysis_bulk/modules/allgroups_dietgenes_cth20_ribo.RData")
clustering_old = clustering
logFCs_scaled <- t(apply(logFCs[apply(logFCs, 1, sd)!=0,],1,scale))
colnames(logFCs_scaled) <- colnames(logFCs)
distance <- dist(logFCs_scaled, method = "euclidean")
clustering_ <- hclust(distance, method = "ward.D2")
ct <- cutree(clustering_, h =20)
clustering <- match(ct,unique(ct[clustering_$order]))
names(clustering) <- names(ct)
save(clustering, file = "analysis/dietmodules.RData")
table(clustering[intersect(names(clustering), names(clustering_old))],
clustering_old[intersect(names(clustering), names(clustering_old))])
barplots <- keggbargraphs(thisclustering = clustering, universe = rownames(toplist))
gobpbarplots <- keggbargraphs(thisclustering = clustering, universe = rownames(toplist),
pathways = gobp_pathways)
##
## 1 2 3 4 5 6 7 8 9
## 1 41 0 2 0 0 9 61 0 11
## 2 191 4 17 0 33 5 3 0 0
## 3 0 1 47 0 3 15 8 0 4
## 4 78 99 34 0 0 0 6 0 3
## 5 0 0 0 163 4 27 1 4 10
## 6 0 0 2 19 83 184 12 0 7
## 7 0 11 0 10 5 1 118 11 7
## 8 0 0 0 0 0 0 3 63 14
## 9 0 0 10 4 0 0 2 2 178
nclusters = max(clustering)
clustercolors = gg_color_hue(nclusters)
clusters <- data.frame(clustering)
clusters$clustering <- factor(clusters$clustering)
n = max(as.numeric(clusters[,1]))
hues = seq(n, 375, length = n + 1)
colortranslator = hcl(h = hues, l = 65, c = 100)[1:n]
names(colortranslator) <- 1:n
cols = gg_color_hue(max(clustering))
pheatmap(logFCs_scaled[names(sort(clustering)),],
cluster_rows = FALSE, show_rownames = FALSE,
cluster_cols = FALSE,gaps_row = which(sort(clustering) !=
sort(clustering)[c(2:length(clustering),1)]),
color = colorRampPalette(c("#0000CC","#FFFFFF","#CC0000"))(100))
toPlot <- as.data.frame(logFCs_scaled)
toPlot$Gene <- rownames(logFCs_scaled)
toPlotLong <- melt(toPlot, id = "Gene")
toPlotLong$Cluster <- factor(clusters[toPlotLong$Gene,1], c(sort(unique(clusters[,1])),16))
toPlotLong$Cluster[is.na(toPlotLong$Cluster)] <- 16
groupOrder = c("day0_STD", "day22_STD", "day43_STD", "day70_STD",
"day0_GW3965", "day22_GW3965", "day43_GW3965", "day70_GW3965")
toPlotLong$variable = factor(toPlotLong$variable, levels = groupOrder)
toPlotLong$Diet = factor(gsub(".*_","",toPlotLong$variable),levels = c("STD","GW3965"))
toPlotLong$Day = factor(gsub("_.*","",toPlotLong$variable))
nclusters = max(as.numeric(clusters[,1]))
clustercolors = c(gg_color_hue(nclusters),"#BBBBBB")
#### Plots with individual gene changes
logFCplots = list()
allLogFCs_ <- allLogFCs
allLogFCs_$Cluster <- clustering[allLogFCs$Gene]
for(i in 1:max(clustering)){
g <- ggplot(allLogFCs_[allLogFCs_$Cluster %in% i,], aes(x = as.numeric(as.factor(Day)),
y = value)) +
geom_line(alpha = 0.1, aes(group = paste(Gene,Diet), color = Diet)) +
geom_smooth(aes(group = paste(Diet)), color = "black",
method = "loess", formula = 'y~x', size = 2.2) +
geom_smooth(aes(color = Diet), method = "loess", formula = 'y~x',
size = 2, alpha = 0) +
theme(axis.title = element_blank(),
plot.title = element_text(hjust = 0.5),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
#axis.ticks.y = element_blank(),
#axis.text.y = element_blank(),
panel.background = element_blank(),
legend.key = element_blank()) +
scale_x_continuous(labels=sort(unique(allLogFCs_$Day))) +
labs(title= paste("Module",i))
logFCplots <- c(logFCplots, list(g))
}
#### Plots with average gene changes
glistdietstraight = list()
for(i in 1:max(clustering)){
yminval = abs(min(toPlotLong$value[toPlotLong$Cluster==i ]))
g <- ggplot(toPlotLong[toPlotLong$Cluster==i,], aes(x = as.numeric(Day),
y = value + yminval)) +
theme(axis.title = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
#axis.ticks.y = element_blank(),
#axis.text.y = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_x_continuous(labels=sort(unique(toPlotLong$Day)), breaks = 1:length(unique(toPlotLong$Day))) +
stat_summary(data =toPlotLong[toPlotLong$Cluster==i & toPlotLong$Diet=="STD",],
aes(group = Diet), geom="area", fill = clustercolors[i]) +
stat_summary(aes(group=Diet, linetype = Diet), fun=mean, geom="line") +
stat_summary(aes(group = Diet), geom="errorbar", width=0.1) +
scale_y_continuous(expand = expansion(mult = c(0.1,0.2))) +
ggpubr::stat_compare_means(aes(group = Diet), label = "p.signif")
glistdietstraight[[length(glistdietstraight)+1]] <- g
}
#### Plots with individual sample averages
glisterrort = list()
for(i in 1:max(clustering)){
clustertpms <- normCountsScaled[,c(names(clustering)[clustering==i],
"Day","Diet","Sample")]
clustertpmlong <- melt(clustertpms, id = c("Day","Diet", "Sample"))
clustermeans <- aggregate(clustertpmlong$value,
by =list(clustertpmlong$Day,
clustertpmlong$Diet,
clustertpmlong$Sample),
mean)
colnames(clustermeans) <- c("Day","Diet","Sample","value")
g <- ggplot(clustermeans,
aes(x = as.numeric(as.factor(Day)), y = value)) +
#geom_smooth(aes(color = paste(Diet)),
# method = "loess", formula = 'y~x', size = 1, alpha = 0.2) +
geom_point(aes(fill = Diet),
pch = 21, position=position_dodge(width = 0.5)) +
theme(axis.title = element_blank(),
plot.title = element_text(hjust = 0.5),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(),
#axis.ticks.y = element_blank(),
#axis.text.y = element_blank(),
panel.background = element_blank(),
legend.key = element_blank()) +
scale_x_continuous(labels=sort(unique(clustertpmlong$Day)),
breaks = 1:length(unique(clustertpmlong$Day))) +
scale_y_continuous(expand = expansion(mult = c(0.1,0.25))) +
scale_fill_manual(values=c(GW3965 = "#1f538d", STD="#dfdfde")) +
stat_summary(aes(group =Diet),geom="errorbar", position = position_dodge(width = 0.5),width = 0.1) +
ggpubr::stat_compare_means(aes(group = Diet),label = "p.signif", method = "t.test", paired = TRUE,
hide.ns = TRUE, label.y = max(clustermeans$value)*1.1)
#stat_compare_means(aes(group = Diet, label = paste0("p =", ..p.format..))) +
#labs(title= paste("Module",i))
glisterrort[[i]] <- g
}
### Combining and showing plots
combinedlist <- list()
for(i in 1:nclusters){
kegggobp <- rbind(gobpbarplots[[i]]$data[1:min(c(5,nrow(gobpbarplots[[i]]$data))),],
barplots[[i]]$data[1:min(c(5,nrow(barplots[[i]]$data))),])
kg <- ggplot(kegggobp,
aes(x = -log10(Adjusted.P.value),
y = shortTerm)) +
#geom_vline(xintercept = -log10(0.05), color = "black") +
geom_bar(stat ="identity", fill = clustercolors[i]) +
theme_classic() +
theme(axis.line = element_blank(), axis.title.y = element_blank()) +
geom_vline(xintercept = -log10(0.05), linetype = "dashed", color = "#BBBBBB") +
geom_text(x = 0,
aes(label = paste0(" ",Overlap,": ",shortGenes)),
hjust = 0, size = 3) +
xlim(c(0, max(5,max(-log10(kegggobp[,"Adjusted.P.value"]), na.rm = TRUE))))
tg <- textGrob(paste0("Module ",i,": ",table(clustering)[i]," genes"))
combinedlist <- c(combinedlist, list(tg),
list(logFCplots[[i]] + theme(plot.title = element_blank(), legend.position = "bottom")),
list(glistdietstraight[[i]] + theme(plot.title = element_blank(), legend.position = "bottom")),
list(glisterrort[[i]] + theme(plot.title = element_blank(), legend.position = "bottom")),
list(kg))
}
arrangePlots(gl = combinedlist, nc = 4, nr = 4, widths = c(0.7,0.7,0.7,1),
layout_matrix = rbind(1,c(2,3,4,5),6,c(7,8,9,10)),
heights = c(0.2,1,0.2,1), nplots = 10)
glist=list()
for(i in 1:max(clustering)){
clustertpms <- normCountsScaled[,c(names(clustering)[clustering==i],
"Day","Diet","Sample")]
clustertpmlong <- melt(clustertpms, id = c("Day","Diet", "Sample"))
clustermeans <- aggregate(clustertpmlong$value,
by =list(clustertpmlong$Day,
clustertpmlong$Diet,
clustertpmlong$Sample),
mean)
colnames(clustermeans) <- c("Day","Diet","Sample","value")
clustermeans$RibosomalRNA <- metadata[clustermeans$Sample,"perc_ribo"]
g <- ggplot(clustermeans,
aes(x = RibosomalRNA, y = value)) +
geom_point(aes(color = Diet, shape = Day)
) +
theme(axis.title.y = element_blank(),
plot.title = element_text(hjust = 0.5),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
#axis.ticks.y = element_blank(),
#axis.text.y = element_blank(),
panel.background = element_blank(),
legend.key = element_blank(),
legend.position = "none") +
#stat_compare_means(aes(group = Diet, label = paste0("p =", ..p.format..))) +
labs(title= paste("Module",i))
glist <- c(glist, list(g))
}
ggarrange(plotlist = glist, ncol=3, nrow=2, common.legend = TRUE, legend="top")
glist=list()
for(i in 1:max(clustering)){
clustertpms <- normCountsScaled[,c(names(clustering)[clustering==i],
"Day","Diet","Sample")]
clustertpmlong <- melt(clustertpms, id = c("Day","Diet", "Sample"))
clustermeans <- aggregate(clustertpmlong$value,
by =list(clustertpmlong$Day,
clustertpmlong$Diet,
clustertpmlong$Sample),
mean)
colnames(clustermeans) <- c("Day","Diet","Sample","value")
clustermeans$RIN <- metadata[clustermeans$Sample,"RIN_values"]
g <- ggplot(clustermeans,
aes(x = RIN, y = value)) +
geom_point(aes(color = Diet, shape = Day)
) +
theme(axis.title.y = element_blank(),
plot.title = element_text(hjust = 0.5),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
#axis.ticks.y = element_blank(),
#axis.text.y = element_blank(),
panel.background = element_blank(),
legend.key = element_blank(),
legend.position = "none") +
#stat_compare_means(aes(group = Diet, label = paste0("p =", ..p.format..))) +
labs(title= paste("Module",i))
glist <- c(glist, list(g))
}
ggarrange(plotlist = glist, ncol=3, nrow=2, common.legend = TRUE, legend="top")
## $`1`
##
## $`2`
##
## attr(,"class")
## [1] "list" "ggarrange"
## $`1`
##
## $`2`
##
## attr(,"class")
## [1] "list" "ggarrange"
alltop5GO = list()
alltop5KEGG = list()
for(i in 1:length(gobpbarplots)){
top5GO = gobpbarplots[[i]]$data[1:min(c(5,nrow(gobpbarplots[[i]]$data))),]
top5KEGG = barplots[[i]]$data[1:min(c(5,nrow(barplots[[i]]$data))),]
top5GO = top5GO[top5GO$Adjusted.P.value<0.05,]
top5KEGG = top5KEGG[top5KEGG$Adjusted.P.value<0.05,]
if(!is.na(top5GO$Term[1])){
for(j in 1:nrow(top5GO)){
if(!is.null(alltop5GO[[top5GO$Term[j]]])){
alltop5GO[[top5GO$Term[j]]] = c(alltop5GO[[top5GO$Term[j]]], i)
}else{
alltop5GO[[top5GO$Term[j]]] = i
}
}
}
if(!is.na(top5KEGG$Term[1])){
for(j in 1:nrow(top5KEGG)){
if(!is.null(alltop5KEGG[[top5KEGG$Term[j]]])){
alltop5KEGG[[top5KEGG$Term[j]]] = c(alltop5KEGG[[top5KEGG$Term[j]]], i)
}else{
alltop5KEGG[[top5KEGG$Term[j]]] = i
}
}
}
}
allmods <- c(alltop5GO, alltop5KEGG)
pathways <- c(kegg_pathways[names(alltop5KEGG)],gobp_pathways[names(alltop5GO)])
pathways <- pathways[order(sapply(allmods,min))]
tNormCountsScaledPathways <- normCountsScaled[,c(unique(unlist(pathways)[!is.na(match(unlist(pathways),
colnames(normCountsScaled)))]),"Day","Diet","Sample")]
normCountsScaledPathways <- t(tNormCountsScaledPathways[,-((ncol(tNormCountsScaledPathways)-2):ncol(tNormCountsScaledPathways))])
gliststraightpaired = list()
for(m in sort(unique(unlist(allmods)))){
for(iname in names(allmods)[grep(m, allmods)]){
i = intersect(pathways[[iname]],names(clustering)[clustering == m])
g <- ggplot(toPlotLong[toPlotLong$Gene %in% i,], aes(x = as.numeric(Day),
y = value)) +
theme(axis.title = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(),
panel.background = element_blank(),
plot.title = element_text(size = 8,hjust = 0)
) +
scale_x_continuous(labels=gsub("ay","",sort(unique(toPlotLong$Day))), breaks = 1:length(unique(toPlotLong$Day))) +
labs(title = paste(gsub(" - Mus musculus \\(mouse\\)","",iname), "\nmodule", m)) +
stat_summary(aes(group=Diet, linetype = Diet, color = Diet), fun=mean, geom="line") +
stat_summary(aes(group = Diet), geom="errorbar", width=0.1) +
ggpubr::stat_compare_means(aes(group = Diet), label = "p.signif", paired = TRUE,
label.y = max(aggregate(toPlotLong[toPlotLong$Gene %in% i,]$value, by=
list(toPlotLong[toPlotLong$Gene %in% i,]$Diet,
toPlotLong[toPlotLong$Gene %in% i,]$Day),
mean)$x)*1.1) +
scale_color_manual(values=c(GW3965="red",STD="black")) +
scale_y_continuous(expand = expansion(mult = c(0.1,0.2)))
gliststraightpaired[[as.character(m)]][[length(gliststraightpaired[[as.character(m)]])+1]] <- g
}
}
glistmoduleerrort_smaller = list()
for(m in sort(unique(unlist(allmods)))){
for(iname in names(allmods)[grep(m, allmods)]){
i = intersect(intersect(pathways[[iname]],names(clustering)[clustering == m]),colnames(tNormCountsScaledPathways))
clustertpms <- tNormCountsScaledPathways[,c(i,
"Day","Diet","Sample")]
clustertpmlong <- melt(clustertpms, id = c("Day","Diet", "Sample"))
clustermeans <- aggregate(clustertpmlong$value,
by =list(clustertpmlong$Day,
clustertpmlong$Diet,
clustertpmlong$Sample),
mean)
colnames(clustermeans) <- c("Day","Diet","Sample","value")
g <- ggplot(clustermeans,
aes(x = as.numeric(as.factor(Day)), y = value)) +
geom_quasirandom(aes(fill = Diet),
pch = 21, dodge.width = 0.75, size = 1) +
scale_fill_manual(values = c(GW3965 = "#1f538d",STD = "#dfdfde")) +
theme(axis.title = element_blank(),
axis.line = element_line(),
plot.title = element_text(hjust = 0, size = 8),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
#axis.ticks.y = element_blank(),
#axis.text.y = element_blank(),
panel.background = element_blank(),
legend.key = element_blank()) +
scale_x_continuous(labels=sort(unique(clustertpmlong$Day)),
breaks = 1:length(unique(clustertpmlong$Day))) +
#stat_compare_means(aes(group = Diet, label = paste0("p =", ..p.format..))) +
labs(title= paste(gsub(" - Mus musculus \\(mouse\\)","",iname), "\nmodule",m)) +
stat_summary(aes(group =Diet),geom="errorbar", position = position_dodge(width = 0.75), width = 0.3) +
ggpubr::stat_compare_means(aes(group = Diet), label = "p.signif", method = "t.test", label.y = max(clustermeans$value)*1.2) +
scale_y_continuous(expand = expansion(mult = c(0.1,0.2)))
glistmoduleerrort_smaller[[as.character(m)]][[length(glistmoduleerrort_smaller[[as.character(m)]])+1]] <- g
}
#grid.arrange(grobs = glist)
}
newlist <- list()
for(m in names(gliststraightpaired)){
for(i in 1:length(gliststraightpaired[[m]])){
newlist[[m]] <- c(newlist[[m]], lapply(list(gliststraightpaired[[m]][[i]],glistmoduleerrort_smaller[[m]][[i]]),
function(x){x + labs() + theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "none",
plot.title=element_text(size = 8)) +
coord_cartesian(clip="off")
}))
}
}
newlist[[m]] = c(newlist[[m]], list(as_ggplot(get_legend(newlist[[m]][[1]]+ theme(legend.position = "right"))),as_ggplot(get_legend(newlist[[m]][[2]]+ theme(legend.position = "right")))))
for(m in names(newlist)){
arrangePlots(newlist[[m]], nr = 2, nc = 4)
}
modvar = as.data.frame(matrix(ncol = 4, nrow = length(unlist(allmods))))
samvar = as.data.frame(matrix(ncol = 4, nrow = length(unlist(allmods))))
moddiff = as.data.frame(matrix(ncol = 4, nrow = length(unlist(allmods))))
samdiff = as.data.frame(matrix(ncol = 4, nrow = length(unlist(allmods))))
colnames(modvar) = colnames(moddiff) = sort(unique(c("day0", "day22", "day43", "day70")))
colnames(samvar) = colnames(samdiff) = sort(unique(c("0", "22", "43", "70")))
rownames(samvar) = rownames(modvar) = rownames(moddiff) = rownames(samdiff) = unlist(lapply(1:length(allmods), function(x){paste(names(allmods)[x],allmods[[x]])}))[order(unlist(allmods))]
for(m in sort(unique(unlist(allmods)))){
for(iname in names(allmods)[grep(m, allmods)]){
i = intersect(pathways[[iname]],names(clustering)[clustering == m])
modpath = toPlotLong[toPlotLong$Gene %in% i,]
for(day in unique(modpath$Day)){
daymp = modpath[modpath$Day==day,]
wt = wilcox.test(daymp$value~daymp$Diet, paired = TRUE)
modvar[paste(iname, m),day] = wt$p.value
}
modpathmeans = aggregate(modpath$value, by = list(modpath$Day, modpath$Diet, modpath$variable), mean)
for(day in unique(modpath$Day)){
moddiff[paste(iname, m),day] = modpathmeans[modpathmeans$Group.1 == day & modpathmeans$Group.2 == "GW3965","x"] -
modpathmeans[modpathmeans$Group.1 == day & modpathmeans$Group.2 == "STD","x"]
}
clustertpms <- tNormCountsScaledPathways[,c(i,
"Day","Diet","Sample")]
clustertpmlong <- melt(clustertpms, id = c("Day","Diet", "Sample"))
clustermeans <- aggregate(clustertpmlong$value,
by =list(clustertpmlong$Day,
clustertpmlong$Diet,
clustertpmlong$Sample),
mean)
colnames(clustermeans) <- c("Day","Diet","Sample","value")
for(day in sort(unique(clustermeans$Day))){
daycm = clustermeans[clustermeans$Day==day,]
tt = t.test(daycm$value~daycm$Diet)
samvar[paste(iname, m),day] = tt$p.value
daycmmean = aggregate(daycm$value, by=list(daycm$Diet), mean)
samdiff[paste(iname, m),day] = daycmmean[daycmmean$Group.1 == "GW3965","x"] - daycmmean[daycmmean$Group.1 == "STD","x"]
}
}
}
moddiff$pathmod = rownames(moddiff)
modvar$pathmod = rownames(modvar)
ps = melt(modvar)
colnames(ps)[3] = "p_value"
ps$p_value[ps$p_value<0.001] = 1000
ps$p_value[ps$p_value<0.01] = 100
ps$p_value[ps$p_value<0.05] = 10
ps$p_value[ps$p_value<2] = ""
ps$p_value[ps$p_value==10] = "*"
ps$p_value[ps$p_value==100] = "**"
ps$p_value[ps$p_value==1000] = "***"
ggplot(merge(melt(moddiff),ps), aes(y = factor(pathmod, levels = rev(moddiff$pathmod)), x = variable, fill = value)) + geom_tile() +
geom_text(aes(label = p_value), vjust = 0.7) +
scale_fill_gradientn(limits = c(-max(abs(moddiff[,1:4])),max(abs(moddiff[,1:4]))),
colors=c("navyblue","white","firebrick")) + theme(panel.background = element_blank())
keggonly = merge(melt(moddiff),ps)
keggonly = keggonly[grep("Mus musculus",keggonly$pathmod),]
ggplot(keggonly, aes(y = factor(pathmod, levels = rev(moddiff$pathmod)), x = variable, fill = value)) + geom_tile() +
geom_text(aes(label = p_value), vjust = 0.7) +
scale_fill_gradientn(limits = c(-max(abs(keggonly$value)),max(abs(keggonly$value))),
colors=c("navyblue","white","firebrick")) +theme(panel.background = element_blank())
samdiff$pathmod = rownames(samdiff)
samvar$pathmod = rownames(samvar)
ps = melt(samvar)
colnames(ps)[3] = "p_value"
ps$p_value[ps$p_value<0.001] = 1000
ps$p_value[ps$p_value<0.01] = 100
ps$p_value[ps$p_value<0.05] = 10
ps$p_value[ps$p_value<2] = ""
ps$p_value[ps$p_value==10] = "*"
ps$p_value[ps$p_value==100] = "**"
ps$p_value[ps$p_value==1000] = "***"
ggplot(merge(melt(samdiff),ps), aes(y = factor(pathmod, levels = rev(samdiff$pathmod)), x = variable, fill = value)) + geom_tile() +
geom_text(aes(label = p_value), vjust = 0.7) +
scale_fill_gradientn(limits = c(-max(abs(samdiff[,1:4])),max(abs(samdiff[,1:4]))),
colors=c("navyblue","white","firebrick")) +theme(panel.background = element_blank())
keggonly = merge(melt(samdiff),ps)
keggonly = keggonly[grep("Mus musculus",keggonly$pathmod),]
ggplot(keggonly, aes(y = factor(pathmod, levels = rev(samdiff$pathmod)), x = variable, fill = value)) + geom_tile() +
geom_text(aes(label = p_value), vjust = 0.7) +
scale_fill_gradientn(limits = c(-max(abs(keggonly$value)),max(abs(keggonly$value))),
colors=c("navyblue","white","firebrick")) + theme(panel.background = element_blank())